from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm
import json
import torch
import random
import numpy as np

# Reproducibility: seed the torch, Python and NumPy random number generators
# with the same fixed value.
seed = 42
for _seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    _seed_rng(seed)

# Collected output records; filled in by the generation loop further down.
results = []

# Checkpoint identifier shared by the tokenizer and the model so the two
# cannot drift apart.
MODEL_NAME = "Qwen/Qwen2.5-72B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Quantization settings handed to from_pretrained below.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,           # enable 8-bit quantization
    llm_int8_threshold=6.0,      # (optional) default threshold for LLM.int8()
    llm_int8_skip_modules=None,  # (optional) no modules explicitly skipped
)

# device_map="auto" leaves device placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)

# Load the entities to prompt for, one per line of prompt_entities.txt.
# - The file is decoded as UTF-8 explicitly so names containing non-ASCII
#   characters do not depend on the platform's default encoding.
# - Surrounding whitespace is stripped and blank lines are dropped, so an empty
#   line never turns into an empty "bio of: ." prompt.
# - The file object is iterated directly instead of materializing readlines().
with open('prompt_entities.txt', 'r', encoding='utf-8') as f:
    topics = [name for name in (line.strip() for line in f) if name]

# Token ids that end a generation. The list does not change between topics, so
# it is built once here rather than on every loop iteration.
terminators = [
    tokenizer.eos_token_id,
]

# For every topic, sample five responses to a short bio prompt and append a
# {'Prompt', 'Responses'} record to `results`. tqdm reports progress over the
# topic list.
for topic in tqdm(topics, desc="Generating bios"):
    content = f"Tell me a paragraph bio of: {topic}. "
    messages = [{"role": "user", "content": content}]

    # return_dict=True makes the tokenizer return the attention mask together
    # with the input ids. The mask is passed to generate() explicitly because
    # pad_token_id is set to the EOS id below, which prevents generate() from
    # inferring the mask on its own.
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    input_ids = encoded["input_ids"]
    prompt_len = input_ids.shape[-1]

    outputs = model.generate(
        input_ids,
        attention_mask=encoded["attention_mask"],
        max_new_tokens=500,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.9,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=5,
    )

    # Each returned sequence starts with the prompt tokens; slice them off so
    # only the newly generated text is decoded.
    generations = [
        tokenizer.decode(sequence[prompt_len:], skip_special_tokens=True).strip()
        for sequence in outputs
    ]
    results.append({'Prompt': topic, 'Responses': generations})

# Persist all collected records to bio_qwen72b.json as 4-space-indented JSON.
with open('bio_qwen72b.json', 'w') as out_file:
    out_file.write(json.dumps(results, indent=4))